In [11]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import networkx as nx
import pylab as plt

from collections import Counter
import community

In [192]:
# get meme_names
meme_name="thevoice"

results_path="/home/clemsos/Dev/mitras/results/"
meme_path=results_path+"/"+meme_name

Load data

Parse data from file into a directed weighted graph


In [193]:
meme_graph_csv=meme_path+"/"+meme_name+"_edges.csv"
# meme_graph_csv=meme_path+"/"+meme_name+"_d3graph.csv"

with open(meme_graph_csv, 'rb') as edgefile:
    edgefile.next() # skip headers
    edgecsv=csv.reader(edgefile)
    
    # edges=[ str(str(edge[0])+","+str(edge[1])+","+str(edge[2])) for edge in edgecsv]
    
    edges=[(e[0], e[1]) for e in edgecsv]
    edges_weighted=[str(p[0][0]+" "+p[0][1]+" "+str(p[1])) for p in Counter(edges).most_common()] # if p[1] > 1]
        
    
    print "Edges (raw files) : %d edges"%len(edges)
    print "Weighted edges %d"%len(edges_weighted)

    G = nx.read_weighted_edgelist(edges_weighted, nodetype=str, delimiter=" ",create_using=nx.DiGraph())
    
    # G=nx.read_weighted_edgelist(edgefile, create_using=nx.DiGraph(), delimiter=",")


Edges (raw files) : 34374 edges
Weighted edges 32105

 Connected components

A graph is connected if there is a path between every pair of nodes in the graph. A connected component is a subset of the nodes where:

  1. A path exists between every pair in the subset.
  2. The subset is not part of a larger set with the above property.

In [195]:
G_components = nx.connected_component_subgraphs(G.to_undirected())
print "Number of components: %d"%nx.number_connected_components(G.to_undirected())

# print "Number of strongly connected components: %d"%nx.number_strongly_connected_components(G)

main_components=[g for g in G_components[0:5]]
for i,g in enumerate(main_components): 
    print "cluster %i : %.3f %% "%(i,(float(g.number_of_nodes())/G.number_of_nodes())*100)


Number of components: 1305
cluster 0 : 81.787 % 
cluster 1 : 0.530 % 
cluster 2 : 0.204 % 
cluster 3 : 0.182 % 
cluster 4 : 0.178 % 

Strongly connected component

The most connected component shows


In [196]:
G_mc = G_components[0]
G_mc_per_total= (float(G_mc.number_of_nodes())/G.number_of_nodes())*100
print "Biggest connected component is %.2f %% of the graph" % G_mc_per_total

nx.write_graphml(G_mc, meme_path+"/"+meme_name+"_main_component.graphml")
# use the most connected component as the main one
# G = G_components[0]


Biggest connected component is 81.79 % of the graph

Partition and communities

Modularity

Measures how well a network decomposes into modular communities.

A high modularity score indicates sophisticated internal structure. This structure, often called a community structure, describes how the the network is compartmentalized into sub-networks. These sub-networks (or communities) have been shown to have significant real-world meaning.


In [168]:
# create dendogram
dendo = community.generate_dendogram(G.to_undirected())

for level in range(len(dendo) - 1) :
    print "partition at level", level #, "is", community.partition_at_level(dendo, level)


partition at level 0
partition at level 1
partition at level 2

In [172]:
# from http://perso.crans.org/aynaud/communities/ 

# Best partition
best_partition = community.best_partition(G.to_undirected()) 
modularity=community.modularity(best_partition,G.to_undirected())
print "Modularity of the best partition: %f"%modularity
print "Number of nodes in the best partition : ", len(set(best_partition.values()))


Modularity of the best partition: 0.923318
Number of nodes in the best partition :  719

In [173]:
G_ok=community.induced_graph(best_partition,G.to_undirected())
nx.write_graphml(G_ok, meme_path+"/"+meme_name+"_best_partition.graphml")

# nx.draw(G_ok)
# print  best_partition.values()
# for index in best_partition:
    # print best_partition[index]
    #print a
    # print "modularity: %.3f %%"%(community.modularity(communities, G.to_undirected()))    
# Modularity
# partitions = community.best_partition(G.to_undirected()) 
# print partition.values()

Nodes measures

Betweenness centrality

Betweenness centrality is a measure of a node's centrality in a network. It is equal to the number of shortest paths from all vertices to all others that pass through that node. Betweenness centrality is a more useful measure (than just connectivity) of both the load and importance of a node. The former is more global to the network, whereas the latter is only a local effect.


In [186]:
# Create ordered tuple of centrality data
cent_dict=nx.betweenness_centrality (G.to_undirected())
cent_items=[(b,a) for (a,b) in cent_dict.iteritems()]

# Sort in descending order 
cent_items.sort() 
cent_items.reverse() 

# Highest centrality 
for c in cent_items[0:5]:
    print "Highest betweeness centrality :%f"%c[0]


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-186-c073715d9603> in <module>()
      1 # Create ordered tuple of centrality data
----> 2 cent_dict=nx.betweenness_centrality (G.to_undirected())
      3 cent_items=[(b,a) for (a,b) in cent_dict.iteritems()]
      4 
      5 # Sort in descending order

/usr/local/lib/python2.7/dist-packages/networkx/algorithms/centrality/betweenness.pyc in betweenness_centrality(G, k, normalized, weight, endpoints, seed)
    111             betweenness=_accumulate_endpoints(betweenness,S,P,sigma,s)
    112         else:
--> 113             betweenness=_accumulate_basic(betweenness,S,P,sigma,s)
    114     # rescaling
    115     betweenness=_rescale(betweenness, len(G),

/usr/local/lib/python2.7/dist-packages/networkx/algorithms/centrality/betweenness.pyc in _accumulate_basic(betweenness, S, P, sigma, s)
    265     while S:
    266         w=S.pop()
--> 267         coeff=(1.0+delta[w])/sigma[w]
    268         for v in P[w]:
    269             delta[v] += sigma[v]*coeff

KeyboardInterrupt: 

Node in/out degree


In [194]:
%pylab inline

in_degrees = G.in_degree() # dictionary node:degree
in_count=[v for v in  Counter(in_degrees.values()).most_common() if v[0] > 2] 
in_values=[v[0] for v in in_count]
in_hist=[v[1] for v in in_count]

out_degrees = G.out_degree() # dictionary node:degree
out_count=[v for v in  Counter(out_degrees.values()).most_common() if v[0] > 2] 
out_values=[v[0] for v in out_count]
out_hist=[v[1] for v in out_count]

in_degree_sequence=sorted(G.in_degree().values(),reverse=True) # degree sequence
out_degree_sequence=sorted(G.out_degree().values(),reverse=True) # degree sequence
dmax=max(degree_sequence)
plt.loglog(in_degree_sequence,'b-',marker='o')
plt.loglog(out_degree_sequence,'r-',marker='v')

# plt.figure()
# plt.plot(in_values,in_hist,'rx') # in-degree
# plt.plot(out_values,out_hist,'bv') # out-degree
plt.legend(['In-degree','Out-degree'])
plt.xlabel('Rank')
plt.ylabel('Degree')
plt.title(meme_name+' degree distribution')

for d in Counter(in_degrees.values()).most_common(5):
    print "Highest in-degree value : %d"% d[1]

for d in Counter(out_degrees.values()).most_common(5):
    print "Highest out-degree value : %d"% d[1]


Populating the interactive namespace from numpy and matplotlib
Highest in-degree value : 16515
Highest in-degree value : 4991
Highest in-degree value : 1081
Highest in-degree value : 405
Highest in-degree value : 189
Highest out-degree value : 12322
Highest out-degree value : 4297
Highest out-degree value : 4180
Highest out-degree value : 1599
Highest out-degree value : 615
WARNING: pylab import has clobbered these variables: ['e']
`%pylab --no-import-all` prevents importing * from pylab and numpy

In [29]:
# draw partition graph
size = float(len(set(partition.values())))
pos = nx.spring_layout(G.to_undirected())
count = 0.
for com in set(partition.values()) :
     count = count + 1.
     list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
     nx.draw_networkx_nodes(G.to_undirected(), pos, list_nodes, node_size = 20, node_color = str(count / size))

# nx.draw_networkx_edges(G.to_undirected(),pos, alpha=0.5)


Out[29]:
<matplotlib.collections.LineCollection at 0xcccc490>

Clustering coeficient

A clustering coefficient is a measure of the degree to which nodes in a graph tend to cluster together. Evidence suggests that in most real-world networks, and in particular social networks, nodes tend to create tightly knit groups characterised by a relatively high density of ti

The global clustering coefficient is based on triplets of nodes.

Network average clustering coefficient

A graph is considered small-world :

  • if its average clustering coefficient C is significantly higher than a random graph constructed on the same vertex set
  • if the graph has approximately the same mean-shortest path length as its corresponding random graph.

In [65]:
N,K = G.order(), G.size()
print "Nodes: ", N
print "Edges: ", K

avg_deg = float(K)/N
print "Average degree: ", avg_deg

# Clustering coefficient of all nodes (in a dictionary)
ccs = nx.clustering(G.to_undirected())

# Average clustering coefficient
avg_clust_coef = sum(ccs.values()) / len(ccs)
# also : nx.algorithms.cluster.average_clustering(G.to_undirected())
print "Average clustering coeficient: %f"%avg_clust_coef

# random graph
#  G_random=nx.erdos_renyi_graph(len(G.nodes()), 0.5)
# nx.algorithms.cluster.average_clustering(G_random)


Nodes:  4247
Edges:  4613
Average degree:  1.08617847893
Average clustering coeficient: 0.080233

Cliques

A clique is a set of nodes where every node is connected to every other node


In [1]:
cliques=[c for c in nx.find_cliques(G.to_undirected())]
cliques_length=[len(c) for c in nx.find_cliques(G.to_undirected())]

print "total cliques: %d"%len(cliques_length)
print "max clique length: %d"%max(cliques_length)
print "average clique length: %d"%average(cliques_length)

cliques_dist=Counter(cliques_length).most_common()

title="Cliques distribution"

plt.bar([c[0]for c in cliques_dist ],[c[1]for c in cliques_dist ])
plt.grid(True,linestyle='-',color='0.75')
plt.ylabel('Counts')
plt.title(title,fontstyle='italic')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-2eb16de3f264> in <module>()
----> 1 cliques=[c for c in nx.find_cliques(G.to_undirected())]
      2 cliques_length=[len(c) for c in nx.find_cliques(G.to_undirected())]
      3 
      4 print "total cliques: %d"%len(cliques_length)
      5 print "max clique length: %d"%max(cliques_length)

NameError: name 'nx' is not defined

In [ ]:

Average path length

Average path length is a concept in network topology that is defined as the average number of steps along the shortest paths for all possible pairs of network nodes. It is a measure of the efficiency of information or mass transport on a network.


In [176]:
# nx.average_shortest_path_length(G)

# G_components = nx.connected_component_subgraphs(G.to_undirected())

#for g in G_components:
# print(nx.average_shortest_path_length(g))


---------------------------------------------------------------------------
NetworkXError                             Traceback (most recent call last)
<ipython-input-176-f8e4c08516b9> in <module>()
----> 1 nx.average_shortest_path_length(G)
      2 
      3 # G_components = nx.connected_component_subgraphs(G.to_undirected())
      4 
      5 #for g in G_components:

/usr/local/lib/python2.7/dist-packages/networkx/algorithms/shortest_paths/generic.pyc in average_shortest_path_length(G, weight)
    312     if G.is_directed():
    313         if not nx.is_weakly_connected(G):
--> 314             raise nx.NetworkXError("Graph is not connected.")
    315     else:
    316         if not nx.is_connected(G):

NetworkXError: Graph is not connected.

In [247]:

Giant component

A giant component is a connected component of a given random graph that contains a constant fraction of the entire graph's vertices.


In [218]:
try:
    from networkx import graphviz_layout
    layout=nx.graphviz_layout
except ImportError:
    print "PyGraphviz not found; drawing with spring layout; will be slow."
    layout=nx.spring_layout
    
region=220 # for pylab 2x2 subplot layout
plt.subplots_adjust(left=0,right=1,bottom=0,top=0.95,wspace=0.01,hspace=0.01)

pvals=[0.003, 0.006, 0.008, 0.015]

for p in pvals:
    # G=nx.binomial_graph(n,p)
    pos=layout(G)
    region+=1
    plt.subplot(region)
    plt.title("p = %6.3f"%(p))
    nx.draw(G,pos,
            with_labels=False,
            node_size=10
            )
    
    # identify largest connected component
    Gcc=nx.connected_component_subgraphs(G)
    G0=Gcc[0]
    nx.draw_networkx_edges(G0,pos,
                           with_labels=False,
                           edge_color='r',
                           width=6.0
                        )
    # show other connected components
    for Gi in Gcc[1:]:
       if len(Gi)>1:
          nx.draw_networkx_edges(Gi,pos,
                                 with_labels=False,
                                 edge_color='r',
                                 alpha=0.3,
                                 width=5.0
                                 )


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-218-0d8a6022506b> in <module>()
     19     nx.draw(G,pos,
     20             with_labels=False,
---> 21             node_size=10
     22             )
     23 

/usr/local/lib/python2.7/dist-packages/networkx/drawing/nx_pylab.pyc in draw(G, pos, ax, hold, **kwds)
    124         plt.hold(h)
    125     try:
--> 126         draw_networkx(G,pos=pos,ax=ax,**kwds)
    127         ax.set_axis_off()
    128         plt.draw_if_interactive()

/usr/local/lib/python2.7/dist-packages/networkx/drawing/nx_pylab.pyc in draw_networkx(G, pos, with_labels, **kwds)
    258 
    259     node_collection=draw_networkx_nodes(G, pos, **kwds)
--> 260     edge_collection=draw_networkx_edges(G, pos, **kwds)
    261     if with_labels:
    262         draw_networkx_labels(G, pos, **kwds)

/usr/local/lib/python2.7/dist-packages/networkx/drawing/nx_pylab.pyc in draw_networkx_edges(G, pos, edgelist, width, edge_color, style, alpha, edge_cmap, edge_vmin, edge_vmax, ax, arrows, label, **kwds)
    529                                      antialiaseds = (1,),
    530                                      linestyle    = style,
--> 531                                      transOffset = ax.transData,
    532                                      )
    533 

/usr/lib/pymodules/python2.7/matplotlib/collections.pyc in __init__(self, segments, linewidths, colors, antialiaseds, linestyles, offsets, transOffset, norm, cmap, pickradius, zorder, **kwargs)
   1014             **kwargs)
   1015 
-> 1016         self.set_segments(segments)
   1017 
   1018     def set_segments(self, segments):

/usr/lib/pymodules/python2.7/matplotlib/collections.pyc in set_segments(self, segments)
   1028         if self._uniform_offsets is not None:
   1029             _segments = self._add_offsets(_segments)
-> 1030         self._paths = [mpath.Path(seg) for seg in _segments]
   1031 
   1032     set_verts = set_segments  # for compatibility with PolyCollection

/usr/lib/pymodules/python2.7/matplotlib/path.pyc in __init__(self, vertices, codes, _interpolation_steps, closed, readonly)
    151         self._codes = codes
    152         self._interpolation_steps = _interpolation_steps
--> 153         self._update_values()
    154 
    155         if readonly:

/usr/lib/pymodules/python2.7/matplotlib/path.pyc in _update_values(self)
    199             (self._codes is None or np.all(self._codes <= Path.LINETO))))
    200         self._simplify_threshold = rcParams['path.simplify_threshold']
--> 201         self._has_nonfinite = not np.isfinite(self._vertices).all()
    202 
    203     @property

KeyboardInterrupt: 
/usr/lib/pymodules/python2.7/pygraphviz/agraph.py:1281: RuntimeWarning: Fontconfig warning: "/etc/fonts/conf.d/25-wqy-zenhei.conf", line 11: Having multiple values in <test> isn't supported and may not work as expected
Fontconfig warning: "/etc/fonts/conf.d/53-monospace-lcd-filter.conf", line 10: Having multiple values in <test> isn't supported and may not work as expected

  warnings.warn("".join(errors),RuntimeWarning)

In [149]:
# from http://perso.crans.org/aynaud/communities/ 
partition = community.best_partition(G.to_undirected()) 
print "Partitions found: ", len(set(partition.values())) 

# Modularity
modularity=community.modularity(partition,G.to_undirected())
print "Modularity: %f"%modularity


Partitions found:  313
Modularity: 0.888809

In / out degree

Detect communities


In [122]:
size = float(len(set(partition.values())))
pos = nx.spring_layout(G.to_undirected())
count = 0.
for com in set(partition.values()) :
     count = count + 1.
     list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
     nx.draw_networkx_nodes(G.to_undirected(), pos, list_nodes, node_size = 20, node_color = str(count / size))

nx.draw_networkx_edges(G.to_undirected(),pos, alpha=0.5)


Partitions found:  257
0.877925495457
Out[122]:
<matplotlib.collections.LineCollection at 0x8672510>

 Dendogram

A dendogram is a tree and each level is a partition of the graph nodes. Level 0 is the first partition, which contains the smallest communities, and the best is len(dendogram) - 1. The higher the level is, the bigger are the communities


In [119]:
import numpy as np
ddg=community.generate_dendogram(G.to_undirected())
len(ddg)
#for level in range(len(ddg) - 1) :
#     print "partition at level", level, "is", community.partition_at_level(ddg, level)


Out[119]:
3

In [8]:
# Betweenness centrality
# bet_cen = nx.betweenness_centrality(G_mc)
# print "average_betweenness_centrality %f"%bet_cen

# Closeness centrality
# clo_cen = nx.closeness_centrality(G_mc)
# print "closeness_centrality %f"%clo_cen

# Eigenvector centrality
# eig_cen = nx.eigenvector_centrality(G_mc)
# print eig_cen
#print "Eigenvector centrality : %f"%eig_cen[0]